library(readr)
library(stats)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.2
# Load the Titanic passenger data into a tibble.
# NOTE(review): this is an absolute path on one Windows machine; adjust it
# (or switch to a project-relative path) before running the script elsewhere.
titanic <- read_csv(file = "C:/Users/Admin/Desktop/titanic.csv")
## Rows: 891 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Name, Sex, Ticket, Cabin, Embarked
## dbl (7): PassengerId, Survived, Pclass, Age, SibSp, Parch, Fare
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the column types and the leading values of every column
str(object = titanic)
## spc_tbl_ [891 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ PassengerId: num [1:891] 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : num [1:891] 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : num [1:891] 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr [1:891] "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : chr [1:891] "male" "female" "female" "female" ...
## $ Age : num [1:891] 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : num [1:891] 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : num [1:891] 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr [1:891] "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num [1:891] 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr [1:891] NA "C85" NA "C123" ...
## $ Embarked : chr [1:891] "S" "C" "S" "S" ...
## - attr(*, "spec")=
## .. cols(
## .. PassengerId = col_double(),
## .. Survived = col_double(),
## .. Pclass = col_double(),
## .. Name = col_character(),
## .. Sex = col_character(),
## .. Age = col_double(),
## .. SibSp = col_double(),
## .. Parch = col_double(),
## .. Ticket = col_character(),
## .. Fare = col_double(),
## .. Cabin = col_character(),
## .. Embarked = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# Per-column summary: quantiles and NA counts for numeric columns,
# length/class/mode for character columns
summary(object = titanic)
## PassengerId Survived Pclass Name
## Min. : 1.0 Min. :0.0000 Min. :1.000 Length:891
## 1st Qu.:223.5 1st Qu.:0.0000 1st Qu.:2.000 Class :character
## Median :446.0 Median :0.0000 Median :3.000 Mode :character
## Mean :446.0 Mean :0.3838 Mean :2.309
## 3rd Qu.:668.5 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :891.0 Max. :1.0000 Max. :3.000
##
## Sex Age SibSp Parch
## Length:891 Min. : 0.42 Min. :0.000 Min. :0.0000
## Class :character 1st Qu.:20.12 1st Qu.:0.000 1st Qu.:0.0000
## Mode :character Median :28.00 Median :0.000 Median :0.0000
## Mean :29.70 Mean :0.523 Mean :0.3816
## 3rd Qu.:38.00 3rd Qu.:1.000 3rd Qu.:0.0000
## Max. :80.00 Max. :8.000 Max. :6.0000
## NA's :177
## Ticket Fare Cabin Embarked
## Length:891 Min. : 0.00 Length:891 Length:891
## Class :character 1st Qu.: 7.91 Class :character Class :character
## Mode :character Median : 14.45 Mode :character Mode :character
## Mean : 32.20
## 3rd Qu.: 31.00
## Max. :512.33
##
# Preview the first six rows of the data
head(titanic, n = 6L)
## # A tibble: 6 × 12
## PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin
## <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <chr> <dbl> <chr>
## 1 1 0 3 Braund… male 22 1 0 A/5 2… 7.25 <NA>
## 2 2 1 1 Cuming… fema… 38 1 0 PC 17… 71.3 C85
## 3 3 1 3 Heikki… fema… 26 0 0 STON/… 7.92 <NA>
## 4 4 1 1 Futrel… fema… 35 1 0 113803 53.1 C123
## 5 5 0 3 Allen,… male 35 0 0 373450 8.05 <NA>
## 6 6 0 3 Moran,… male NA 0 0 330877 8.46 <NA>
## # ℹ 1 more variable: Embarked <chr>
# Recode the numeric survival indicator as a factor:
# a value of 1 becomes "Yes", anything else becomes "No"
titanic$Survived <- factor(ifelse(titanic[["Survived"]] == 1, "Yes", "No"))
# Preview the data with the recoded column
head(titanic, n = 6L)
## # A tibble: 6 × 12
## PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin
## <dbl> <fct> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <chr> <dbl> <chr>
## 1 1 No 3 Braund… male 22 1 0 A/5 2… 7.25 <NA>
## 2 2 Yes 1 Cuming… fema… 38 1 0 PC 17… 71.3 C85
## 3 3 Yes 3 Heikki… fema… 26 0 0 STON/… 7.92 <NA>
## 4 4 Yes 1 Futrel… fema… 35 1 0 113803 53.1 C123
## 5 5 No 3 Allen,… male 35 0 0 373450 8.05 <NA>
## 6 6 No 3 Moran,… male NA 0 0 330877 8.46 <NA>
## # ℹ 1 more variable: Embarked <chr>
# Treat Pclass, SibSp and Parch as categorical: convert each of these
# numeric (double) columns to a factor
titanic[["Pclass"]] <- factor(titanic[["Pclass"]])
titanic[["SibSp"]] <- factor(titanic[["SibSp"]])
titanic[["Parch"]] <- factor(titanic[["Parch"]])
# Preview the data with the converted columns
head(titanic, n = 6L)
## # A tibble: 6 × 12
## PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin
## <dbl> <fct> <fct> <chr> <chr> <dbl> <fct> <fct> <chr> <dbl> <chr>
## 1 1 No 3 Braund… male 22 1 0 A/5 2… 7.25 <NA>
## 2 2 Yes 1 Cuming… fema… 38 1 0 PC 17… 71.3 C85
## 3 3 Yes 3 Heikki… fema… 26 0 0 STON/… 7.92 <NA>
## 4 4 Yes 1 Futrel… fema… 35 1 0 113803 53.1 C123
## 5 5 No 3 Allen,… male 35 0 0 373450 8.05 <NA>
## 6 6 No 3 Moran,… male NA 0 0 330877 8.46 <NA>
## # ℹ 1 more variable: Embarked <chr>
# Count the missing values in the Age column
sum(is.na(titanic[["Age"]]))
## [1] 177
# The count above shows that the Age column has 177 missing values.
# They could be filled in with either the mean or the median of Age.
# The mean might not give accurate results, so the (rounded) median is used.
titanic$Age <- replace(
  titanic$Age,
  is.na(titanic$Age),
  round(median(titanic$Age, na.rm = TRUE))
)
# Re-count the missing values to confirm the imputation
sum(is.na(titanic[["Age"]]))
## [1] 0
# Now there are no missing values in Age column
# Exploratory Data Analysis
# Univariate Analysis
# Question
# How many passengers are travelling in each class?
# Bar chart of passenger counts per class; each bar is labelled with its count.
# after_stat(count) replaces the `..count..` notation that was deprecated in
# ggplot2 3.4.0, so the chart renders without a deprecation warning.
ggplot(data = titanic, aes(x = Pclass, fill = Pclass)) +
  geom_bar(position = "dodge") +
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    position = position_dodge(0.9),
    vjust = -0.2 # lift the label just above the top of the bar
  ) +
  ylab("Number of Passengers")

# We can see that most of the passengers are from class 3 (the cheapest of all)
# Surprisingly, there are more passengers in Class-1 than in Class-2
# Question
# How many passengers survived?
# Bar chart of passenger counts per survival status; each bar is labelled
# with its count. after_stat(count) is used instead of the `..count..`
# notation, which was deprecated in ggplot2 3.4.0.
ggplot(data = titanic, aes(x = Survived, fill = Survived)) +
  geom_bar(position = "dodge") +
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    position = position_dodge(0.9),
    vjust = -0.2 # lift the label just above the top of the bar
  ) +
  ylab("Number of Passengers")

# Here we can see that 342 passengers survived and 549 passengers lost their lives
# Question
# How are the fares distributed?
# Histogram of Fare, using bins that are 15 fare units wide
ggplot(titanic) +
  geom_histogram(aes(x = Fare), binwidth = 15) +
  labs(x = "Fare")

# The ticket fares are consistent with the ticket classes: the most frequently purchased tickets were the cheapest ones offered to board the Titanic.
# Bivariate Analysis
# Question
# Which passenger class had the highest chance of survival?
# Dodged bar chart: one bar per (class, survival status) pair, each labelled
# with its count. after_stat(count) is used instead of the `..count..`
# notation, which was deprecated in ggplot2 3.4.0.
ggplot(titanic, aes(x = Pclass, fill = Survived)) +
  geom_bar(position = "dodge") +
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    position = position_dodge(0.9), # keep labels aligned with the dodged bars
    vjust = -0.2
  ) +
  ylab("Number of Passengers") +
  xlab("Passenger Class")

# From the chart we can conclude that people who paid more, i.e. Class-1 passengers, had a much better chance of survival than the others.
# Question
# Which ticket class was chosen by passengers of which age?
# Frequency polygon of Age (bins 2.5 years wide), one line per passenger class
ggplot(titanic, aes(x = Age, colour = Pclass)) +
  geom_freqpoly(binwidth = 2.5) +
  labs(y = "Frequency")

# Here, we can see that most of the passengers, irrespective of ticket class, are of a similar age (approx. 30). Note that this peak is inflated by the 177 missing ages that were filled in with the median above.
# Age distribution by passenger class and sex:
# one boxplot per class, with a separate facet row for each sex
ggplot(titanic, aes(x = factor(Pclass), y = Age, fill = Sex)) +
  geom_boxplot() +
  facet_grid(rows = vars(Sex)) +
  scale_fill_manual(name = "Sex", values = c("lightblue", "pink")) +
  ggtitle("Age Distribution by Passenger Class and Sex") +
  xlab("Passenger Class") +
  ylab("Age") +
  theme_minimal()

# This faceted boxplot shows a similar picture: most of the passengers
# were aged 20-40, across all the ticket classes.
# Boxplot of Fare grouped by survival status.
# Survived was recoded to a No/Yes factor earlier in the script, so the groups
# are already labelled "No" and "Yes"; the axis title therefore no longer
# advertises a 0/1 coding that does not appear on the plot.
boxplot(
  Fare ~ Survived,
  data = titanic,
  main = "Fare by Survival Status",
  xlab = "Survived",
  ylab = "Fare",
  col = "lightgreen"
)

# Scatterplot comparing Age (x axis) with Fare (y axis)
plot(
  x = titanic[["Age"]],
  y = titanic[["Fare"]],
  main = "Age vs. Fare",
  xlab = "Age",
  ylab = "Fare",
  col = "blue"
)

# Scatterplot matrix of six passenger attributes.
# NOTE(review): despite its name, numeric_vars also holds factor columns
# (Survived, Pclass, SibSp and Parch were converted to factors earlier in the
# script); presumably pairs() draws those by their integer level codes rather
# than their original values -- confirm this is intended.
numeric_vars <- titanic[c("Survived", "Pclass", "Age", "SibSp", "Parch", "Fare")]
pairs(x = numeric_vars)

# Question
# How many passengers survived, by port of embarkation and passenger class?
# Bar chart of survival counts, faceted with one row per passenger class and
# one column per value of Embarked.
# Survived was recoded to a No/Yes factor earlier in the script, so the x-axis
# title no longer advertises a 0/1 coding that does not appear on the plot.
ggplot(titanic, aes(x = factor(Survived), fill = factor(Survived))) +
  geom_bar() +
  facet_grid(Pclass ~ Embarked) +
  scale_fill_manual(values = c("lightblue", "pink"), name = "Survived") +
  labs(
    title = "Survival Counts by Passenger Class and Embarked Port",
    x = "Survived",
    y = "Count"
  ) +
  theme_minimal()

# Here we can conclude that the passengers who embarked at the port of Southampton account for both the most survivors and the most passengers who lost their lives
library(plotly)
## Warning: package 'plotly' was built under R version 4.3.2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# 3D scatterplot using plotly: Age on x, Fare on y and survival status on z,
# with the markers coloured by survival status
plot_ly(
  data = titanic,
  x = ~Age,
  y = ~Fare,
  z = ~Survived,
  color = ~Survived,
  colors = c("pink", "lightblue")
) |>
  add_markers() |>
  layout(
    scene = list(
      xaxis = list(title = "Age"),
      yaxis = list(title = "Fare"),
      zaxis = list(title = "Survived")
    )
  )